%run set_theme.ipynb
import plotly.graph_objs as go
import plotly.express as px
import pandas as pd
from plotly.offline import init_notebook_mode
# Initialise plotly's offline notebook mode before any figure is shown.
init_notebook_mode()

# Load the survey data from the local parquet file.
df = pd.read_parquet('../data/SO_2014_2022.pq')

# Mean salary for every (YearsCodePro, Gender) pair; reset_index() turns the
# two group keys back into ordinary columns.
avg_salary = (
    df.groupby(['YearsCodePro', 'Gender'])['Salary']
    .mean()
    .reset_index()
)
avg_salary
| | YearsCodePro | Gender | Salary |
|---|---|---|---|
| 0 | 0 | female | 39553.009805 |
| 1 | 0 | male | 32276.145762 |
| 2 | 1 | female | 39444.444886 |
| 3 | 1 | male | 32361.590644 |
| 4 | 2 | female | 41009.217794 |
| ... | ... | ... | ... |
| 95 | 48 | male | 114928.0 |
| 96 | 49 | female | 8.0 |
| 97 | 49 | male | 117607.3 |
| 98 | 50 | female | 133300.0 |
| 99 | 50 | male | 84511.262295 |
100 rows × 3 columns
# Build, for each gender, a 3-point moving average of the salary increase
# relative to that gender's starting salary (mean salary of respondents with
# YearsCodePro < 2).  The result has the columns YearsCodePro, Gender, Salary,
# where Salary is a fraction (0.5 == +50 % compared with the starting salary).
smoothed_frames = []
for gender in ['male', 'female']:
    # engine='python' is requested explicitly: numexpr cannot handle the
    # extension-array dtypes in `df`, and letting pandas fall back on its own
    # emits a RuntimeWarning on every call.
    start_salary = df.query(
        f"YearsCodePro < 2 & Gender == '{gender}'", engine='python'
    )['Salary'].mean()

    per_gender = (
        avg_salary.query(f"Gender == '{gender}'", engine='python')
        .drop(columns=['Gender'])
    )

    # Relative increase: Salary / start_salary - 1.  YearsCodePro is divided
    # by 1 and reduced by 0, i.e. left unchanged.
    relative = (
        per_gender
        .div({'YearsCodePro': 1, 'Salary': start_salary})
        .sub({'YearsCodePro': 0, 'Salary': 1})
    )

    # Iterating a Rolling object yields one frame per window.  The first two
    # windows hold fewer than three rows, so they are skipped.
    windows = list(relative.rolling(3, on='YearsCodePro'))[2:]
    smoothed = pd.Series(
        [window.mean()['Salary'].round(3) for window in windows],
        name='Salary',
    )

    # Label every full window with the year at its centre.  Taking the real
    # YearsCodePro values (instead of a positional counter) keeps the x-axis
    # correct when some years are missing for a gender.
    center_years = per_gender['YearsCodePro'].iloc[1:-1].reset_index(drop=True)

    smoothed_frames.append(
        pd.DataFrame({
            'YearsCodePro': center_years,
            'Gender': gender,
            'Salary': smoothed,
        })
    )

# Concatenate once at the end: appending to an empty seed DataFrame triggers
# pandas' FutureWarning about empty entries in concat.
smoothed_avg = pd.concat(smoothed_frames, ignore_index=True)
smoothed_avg
C:\Users\Efe\AppData\Local\Temp\ipykernel_4340\510673352.py:4: RuntimeWarning:
Engine has switched to 'python' because numexpr does not support extension array dtypes. Please set your engine to python manually.
C:\Users\Efe\AppData\Local\Temp\ipykernel_4340\510673352.py:11: FutureWarning:
The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.
C:\Users\Efe\AppData\Local\Temp\ipykernel_4340\510673352.py:4: RuntimeWarning:
Engine has switched to 'python' because numexpr does not support extension array dtypes. Please set your engine to python manually.
| | YearsCodePro | Gender | Salary |
|---|---|---|---|
| 0 | 1 | male | 0.029 |
| 1 | 2 | male | 0.164 |
| 2 | 3 | male | 0.302 |
| 3 | 4 | male | 0.530 |
| 4 | 5 | male | 0.712 |
| ... | ... | ... | ... |
| 91 | 43 | female | 1.304 |
| 92 | 44 | female | 1.970 |
| 93 | 45 | female | 1.407 |
| 94 | 46 | female | 0.605 |
| 95 | 47 | female | 0.801 |
96 rows × 3 columns
# Line chart of the smoothed relative salary increase, one line per gender.
fig = px.line(
    smoothed_avg,
    x="YearsCodePro",
    y='Salary',
    color='Gender',
    color_discrete_map=dict(male='#5b6fec', female='#f854ee'),
    range_x=[1, 40],
    range_y=[0, 3],
)

# Axis titles, chart title, hover behaviour, legend placement and canvas size
# are all applied in a single layout update.
fig.update_layout(
    xaxis_title="Years coded professionally",
    yaxis_title="Average salary increase",
    title='Coding Experience vs Relative Salary Increase<br><sup>After five years men receive higher salary increase for the same work experience</sup>',
    yaxis_tickformat='.0%',  # Salary holds fractions; render them as percentages
    hovermode="x",
    hoverlabel=dict(font_color='white', bordercolor='white'),
    legend=dict(
        title='',
        orientation='h',
        xanchor='center',
        yanchor='top',
        x=0.47,
        y=1.085,
        itemwidth=45,
    ),
    margin=dict(t=100, r=50, b=130, l=110),
    width=790,
)

# Caption, positioned in paper coordinates below the plotting area.
fig.add_annotation(
    x=-0.12,
    y=-0.35,
    xref='paper',
    yref='paper',
    xanchor='left',
    yanchor='bottom',
    align='left',
    showarrow=False,
    text=(
        'This shows the years of professional coding experience compared to salary increase relative to starting salary.<br>'
        'Hover over the graph from left to right to compare the increase for both genders.'
    ),
)

# Same hover text for every trace; the empty <extra> tag suppresses the
# secondary hover box.
fig.update_traces(hovertemplate='<br>Salary increase: %{y}<extra></extra>')
fig.show()